---
title: "Prophage clustering using syntenic Jaccard index"
date: "`r Sys.Date()`"
format:
html:
embed-resources: true
df-print: paged
knitr:
opts_chunk:
fig.height: 7
---
***
This script cluster prophages using syntenic Jaccard index distance.
## Initial setup
```{r}
#| label: setup
#| warning: false
suppressPackageStartupMessages (library (tidyverse))
suppressPackageStartupMessages (library (here))
suppressPackageStartupMessages (library (magrittr))
suppressPackageStartupMessages (library (igraph))
suppressPackageStartupMessages (library (apcluster))
suppressPackageStartupMessages (library (dendextend))
suppressPackageStartupMessages (library (ComplexHeatmap))
suppressPackageStartupMessages (library (ape))
suppressPackageStartupMessages (library (logger))
suppressPackageStartupMessages (library (configr))
# cluster prophages to get representative phages
rm (list = ls ())
source ("https://raw.githubusercontent.com/lakhanp1/omics_utils/main/RScripts/utils.R" )
source ("scripts/utils/config_functions.R" )
source ("scripts/utils/homology_groups.R" )
source ("scripts/utils/heatmap_utils.R" )
source ("scripts/utils/compare_hg_sets.R" )
################################################################################
set.seed (124 )
confs <- prefix_config_paths (
conf = suppressWarnings (configr:: read.config (file = "project_config.yaml" )),
dir = "."
)
pangenome <- confs$ data$ pangenomes$ pectobacterium.v2$ name
panConf <- confs$ data$ pangenomes[[pangenome]]
prophageLenCutoff <- confs$ analysis$ prophages$ cutoff_length
outDir <- confs$ analysis$ prophages$ preprocessing$ dir
colorList <- list (
jaccard = list (
breaks = c (0 , 0.1 , 0.2 , 0.3 , 0.4 , 0.5 , 0.6 , 0.7 , 0.8 , 0.9 , 0.95 , 0.97 , 1 ),
colors = viridisLite:: viridis (n = 13 , option = "magma" )
),
mash = list (
breaks = c (0 , 0.03 , 0.05 , 0.1 , 0.15 , 0.2 , 0.3 , 0.4 , 0.5 , 0.6 , 0.7 , 0.8 , 1 ),
colors = viridisLite:: viridis (n = 13 , option = "magma" )
)
)
```
## Import data
```{r}
sampleInfo <- get_metadata (file = panConf$ files$ metadata, genus = confs$ genus)
sampleInfoList <- as.list_metadata (
df = sampleInfo, sampleId, sampleName, SpeciesName, strain, nodeLabs, genomeId
)
phagesRaw <- suppressMessages (readr:: read_tsv (confs$ data$ prophages$ files$ data)) %>%
dplyr:: select (prophage_id, taxonomy, completeness, checkv_quality, genomeId)
phagesConsolidated <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ consolidated)
)
mergedPhages <- dplyr:: filter (phagesConsolidated, filtered != 1 )
# get aditional information for fragmented prophages
fragmented <- dplyr:: filter (mergedPhages, nFragments > 1 ) %>%
dplyr:: filter (jaccardIndex >= 0.5 & perSharedChild >= 0.8 ) %>%
dplyr:: select (prophage_id, fragments, nFragments, prophage_length, nHg, parent) %>%
dplyr:: mutate (fragments = stringr:: str_split (fragments, ";" )) %>%
tidyr:: unnest (fragments) %>%
dplyr:: left_join (
y = phagesRaw, by = c ("fragments" = "prophage_id" )
) %>%
dplyr:: group_by (prophage_id, nFragments, prophage_length, nHg, parent) %>%
dplyr:: summarize (
fragments = paste (fragments, collapse = ";" ),
taxonomy = paste (unique (taxonomy), collapse = "&" ),
completeness = sum (completeness),
checkv_quality = paste ("merged:" , checkv_quality, collapse = "," , sep = " " ),
genomeId = unique (genomeId),
.groups = "drop"
) %>%
dplyr:: mutate (
completeness = pmin (98 , completeness)
)
unfragPhages <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ filtered)
) %>%
dplyr:: select (
prophage_id, prophage_length, nHg, genomeId, completeness,
checkv_quality, taxonomy
)
simDf <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ pair_comparison)
)
# read prophage HGs stored locally
proHgs <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ raw_prophage_hg)
) %>%
dplyr:: select (prophage_id = id, hgs) %>%
dplyr:: mutate (
hgs = stringr:: str_split (hgs, ";" )
)
mashMat <- suppressMessages (
readr:: read_tsv (confs$ analysis$ prophages$ preprocessing$ files$ mash_dist)
) %>%
tibble:: column_to_rownames (var = "prophage_id" ) %>%
as.matrix ()
mashMat <- mashMat[unfragPhages$ prophage_id, unfragPhages$ prophage_id]
mashUpgma <- ape:: read.tree (confs$ analysis$ prophages$ preprocessing$ files$ mash_hclust)
```
## Process syntenic Jaccard similarity and MASH distance data
```{r}
# process data
phageCmpDf <- dplyr:: bind_rows (
simDf,
dplyr:: rename (simDf, p2 = phage1, p1 = phage2) %>%
dplyr:: rename (phage1 = p1, phage2 = p2)
)
allPhageJaccardMat <- tidyr:: pivot_wider (
phageCmpDf,
id_cols = phage1,
names_from = phage2,
values_from = jaccardIndex
) %>%
tibble:: column_to_rownames (var = "phage1" ) %>%
as.matrix ()
jaccardMat <- allPhageJaccardMat[unfragPhages$ prophage_id, unfragPhages$ prophage_id]
if (all (is.na (diag (jaccardMat)))) {
diag (jaccardMat) <- 1
}
if (! isSymmetric (jaccardMat)) {
stop ("pairwise Jaccard index matrix is not symmetric" )
}
# convert to distance matrix
jacDistMat <- max (jaccardMat) - jaccardMat
quantile (jaccardMat, c (0 , 0.25 , 0.5 , 0.75 , seq (0.9 , 0.99 , by = 0.01 ), 0.995 , 1 ))
```
## Hierarchical clustering of Jaccard similarity
:::{.callout-important}
To remove the noise while clustering, exclude all the prophages that show
syntenic Jaccard index 0.5 or lower against other prophages.
```{r}
breakPoint <- 0.5
tempJcMat <- jaccardMat
diag (tempJcMat) <- NA
maxJc <- matrixStats:: rowMaxs (tempJcMat, na.rm = TRUE , useNames = TRUE )
noisyNodes <- which (maxJc <= breakPoint)
trimmedNodes <- which (maxJc > breakPoint)
trimmedJaccardMat <- jaccardMat[unname (trimmedNodes), unname (trimmedNodes)]
```
:::
A heatmap showing the syntenic Jaccard index between the singleton nodes
identified above and the remaining nodes that will be used for clustering.
```{r}
#| column: page
#| fig-height: 7
#| fig-width: 13
#| out-width: '100%'
#| layout-valign: top
ht_noise <- Heatmap (
matrix = jaccardMat[unname (noisyNodes), unname (trimmedNodes)],
name = "noisy_jaccard" ,
column_title = "selected nodes for clustering" ,
row_title = "excluded singleton prophages" ,
col = circlize:: colorRamp2 (
breaks = colorList$ mash$ breaks, colors = colorList$ mash$ colors
),
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
),
use_raster = TRUE , raster_quality = 3 ,
show_row_names = FALSE , show_column_names = FALSE
)
ComplexHeatmap:: draw (
ht_noise,
column_title = "Syntenic Jaccard index" ,
column_title_gp = gpar (fontsize = 16 , fontface = "bold" ),
heatmap_legend_side = "bottom"
)
```
:::{.callout-important}
Noisy nodes identified above will be excluded from the hierarchical clustering
of the prophages based on syntenic Jaccard index. Later, these noisy nodes will
be added as singletons to the clusters identified by `hclust` .
:::
```{r}
phageHc <- hclust (
as.dist (jacDistMat[unname (trimmedNodes), unname (trimmedNodes)]),
method = "complete"
)
# plot(phageHc, hang = -1)
# hclust(as.dist(jacDistMat), method = "ward.D2") %>%
# as.dendrogram() %>%
# dendextend::ladderize() %>%
# plot(horiz = TRUE)
phageDend <- as.dendrogram (phageHc) %>%
dendextend:: ladderize ()
phageDend %>%
dendextend:: get_nodes_attr ("height" ) %>%
hist ()
# dendextend::cutree(as.dendrogram(blacl), h = 0.8) %>% table()
phagePhylo <- ape:: as.phylo (phageDend)
```
```{r}
#| fig-height: 10
#| fig-width: 7
#| out-width: '100%'
#| layout-valign: top
rev (phageDend) %>%
plot (
horiz = TRUE ,
main = "Hierarchical clustering of syntenic Jaccard distance"
)
```
## Visualize the clusters with data
### Syntenic Jaccard distance
```{r}
ht_jaccard <- plot_species_ANI_heatmap (
mat = jaccardMat[phagePhylo$ tip.label, phagePhylo$ tip.label],
phy = phagePhylo,
name = "jaccard" ,
column_title = "Syntenic Jaccard index" ,
col = circlize:: colorRamp2 (
breaks = colorList$ jaccard$ breaks, colors = colorList$ jaccard$ colors
),
show_column_dend = TRUE , column_dend_height = unit (3 , "cm" ),
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
),
use_raster = TRUE , raster_quality = 2
)
```
```{r}
#| echo: false
#| column: page
#| fig-height: 8
#| fig-width: 10
#| out-width: '100%'
#| layout-valign: top
ComplexHeatmap:: draw (
ht_jaccard,
heatmap_legend_side = "bottom"
)
```
### Add MASH distance heatmap
```{r}
ht_mash <- plot_species_ANI_heatmap (
mat = mashMat[phagePhylo$ tip.label, phagePhylo$ tip.label],
phy = phagePhylo,
name = "mash" ,
column_title = "MASH distance" ,
show_column_dend = FALSE ,
col = circlize:: colorRamp2 (
breaks = colorList$ mash$ breaks, colors = colorList$ mash$ colors
),
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
),
use_raster = TRUE , raster_quality = 3
)
htList <- ht_jaccard + ht_mash
```
```{r}
#| echo: false
#| column: page
#| fig-height: 8
#| fig-width: 15
#| out-width: '200%'
#| layout-valign: top
pdf (
file = file.path (outDir, "prophage_jaccard_clustering.pdf" ),
width = 15 , height = 8
)
ComplexHeatmap:: draw (
htList,
main_heatmap = "jaccard" ,
row_dend_side = "left" ,
heatmap_legend_side = "bottom"
)
dev.off ()
ComplexHeatmap:: draw (
htList,
main_heatmap = "jaccard" ,
row_dend_side = "left" ,
heatmap_legend_side = "bottom"
)
```
## Cut tree to generate clusters
:::{.callout-important}
Here, the noisy nodes will be added as singletons to the prophage clusters.
:::
```{r}
treeCut <- dendextend:: cutree (tree = phageDend, h = 0.66 )
phageGroups <- tibble:: enframe (
treeCut,
name = "prophage_id" , value = "phage_grp"
) %>%
dplyr:: bind_rows (
tibble:: tibble (
prophage_id = names (noisyNodes),
phage_grp = (1 : length (noisyNodes)) + length (unique (treeCut))
)
) %>%
dplyr:: mutate (
phage_grp = paste ("phage_grp_" , phage_grp, sep = "" )
) %>%
dplyr:: left_join (unfragPhages, by = "prophage_id" )
```
Finally, add the `r ` nrow(fragmented)` fragmented prophages which could be mapped
to a prophage in the pool with high quality
(syntenic-Jaccard index >= 0.5 & perSharedChild >= 0.8) to the respective
clusters of their best matching parent prophage.
```{r}
fragmentsToAdd <- dplyr:: left_join (
x = fragmented,
y = dplyr:: select (phageGroups, prophage_id, phage_grp),
by = c ("parent" = "prophage_id" )
) %>%
dplyr:: select (- parent)
phageGroups <- dplyr:: bind_rows (phageGroups, fragmentsToAdd) %>%
tidyr:: replace_na (list (nFragments = 1 )) %>%
dplyr:: mutate (
fragments = dplyr:: if_else (is.na (fragments), prophage_id, fragments)
) %>%
dplyr:: select (prophage_id, phage_grp, fragments, nFragments, everything ())
```
Cluster representatives are determined based on the mean Jaccard index of cluster
members against all members. The cluster member with highest mean Jaccard index
against the cluster members is selected as a cluster representative.
Cluster representatives are determined using the following criteria:
- Completeness of the prophages in the cluster determined by checkV (higher -> better)
- mean Jaccard index of the prophage against cluster members
Prophages in the cluster are ranked based on these two criteria and the best
prophage is selected as the cluster representative.
```{r}
# get cluster roots
clusterRoots <- split (x = phageGroups$ prophage_id, f = phageGroups$ phage_grp) %>%
# .[c("phage_grp_114", "phage_grp_12", "phage_grp_131", "phage_grp_132", "phage_grp_173", "phage_grp_174")] %>%
purrr:: map_dfr (
.f = function (x) {
# root -> highest mean Jaccard index across group
if (length (x) == 1 ) {
rm <- setNames (object = 1 , nm = x)
} else {
subJc <- allPhageJaccardMat[x, x]
diag (subJc) <- NA
rm <- matrixStats:: rowMeans2 (subJc, na.rm = TRUE , useNames = TRUE ) %>%
sort (decreasing = TRUE ) %>%
round (digits = 3 )
}
return (
tibble:: tibble (
prophage_id = names (rm),
mean_grp_sim = rm,
grp_size = length (x),
)
)
}
)
```
Combine the clusters with cluster roots.
```{r}
rootedClusters <- dplyr:: left_join (
phageGroups, clusterRoots,
by = "prophage_id"
) %>%
dplyr:: group_by (phage_grp) %>%
dplyr:: arrange (
dplyr:: desc (completeness),
dplyr:: desc (prophage_length),
dplyr:: desc (mean_grp_sim),
.by_group = TRUE
) %>%
dplyr:: mutate (is_root = 1 : n ()) %>%
dplyr:: ungroup () %>%
dplyr:: mutate (
is_root = dplyr:: if_else (is_root == 1 , 1 , 0 )
)
representatives <- dplyr:: filter (rootedClusters, is_root == 1 ) %>%
dplyr:: left_join (y = proHgs, by = "prophage_id" )
rootedClusters %<>% dplyr:: left_join (
y = dplyr:: select (representatives, phage_grp, root_id = prophage_id),
by = "phage_grp"
)
```
### Clustering statistics
Total prophage clusters: `r nrow(representatives)`
Total homology groups in prophage representatives:
`r length(unique(unlist(representatives$hgs)))`
```{r}
phageClusters <- dplyr:: arrange (
rootedClusters,
dplyr:: desc (grp_size), phage_grp, desc (completeness), desc (mean_grp_sim)
) %>%
dplyr:: left_join (
y = dplyr:: select (
sampleInfo, genomeId, sampleId, SpeciesName, nodepath.kmer_upgma,
geo_loc_country, host, isolation_source, env_broad_scale, collection_year
),
by = "genomeId"
)
readr:: write_tsv (
phageClusters,
file = confs$ analysis$ prophages$ files$ clusters
)
```
:::{.callout-note}
What is the variation in genome ANI of the phages in the same group?
:::
If the clustering is good, the representatives prophages should not have high
syntenic Jaccard index against other representative prophages.
```{r}
repJacMat <- jaccardMat[representatives$ prophage_id, representatives$ prophage_id]
diag (repJacMat) <- NA
matrixStats:: rowMaxs (repJacMat, useNames = TRUE , na.rm = TRUE ) %>%
sort (decreasing = TRUE ) %>%
head ()
```
### Draw figures again with representatives
```{r}
repDend <- hclust (
as.dist (
jacDistMat[representatives$ prophage_id, representatives$ prophage_id]
),
method = "complete"
) %>%
as.dendrogram () %>%
dendextend:: ladderize ()
ht_jaccardRep <- plot_species_ANI_heatmap (
mat = jaccardMat[representatives$ prophage_id, representatives$ prophage_id],
phy = repDend,
name = "jaccard" ,
column_title = "Syntenic Jaccard index for representative prophages" ,
col = circlize:: colorRamp2 (
breaks = colorList$ jaccard$ breaks, colors = colorList$ jaccard$ colors
),
use_raster = TRUE , raster_quality = 2 ,
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
)
)
ht_mashRep <- plot_species_ANI_heatmap (
mat = mashMat[representatives$ prophage_id, representatives$ prophage_id],
phy = repDend,
name = "mash" ,
column_title = "MASH distance" ,
col = circlize:: colorRamp2 (
breaks = colorList$ mash$ breaks, colors = colorList$ mash$ colors
),
use_raster = TRUE , raster_quality = 2 ,
heatmap_legend_param = list (
direction = "horizontal" , legend_width = unit (5 , "cm" )
)
)
htListRep <- ht_jaccardRep + ht_mashRep
```
```{r}
#| echo: false
#| column: page
#| fig-height: 8
#| fig-width: 15
#| out-width: '200%'
#| layout-valign: top
ComplexHeatmap:: draw (
htListRep,
main_heatmap = "jaccard" ,
row_dend_side = "left" ,
heatmap_legend_side = "bottom"
)
```
:::{.scrolling_y}
### Visualize the final dendrogram
All the `r nrow(representatives)` representatives which includes noisy nodes
(n = `r length(noisyNodes)` ) and cluster representatives (n = `r max(treeCut)` )
obtained by clustering `r length(trimmedNodes)` prophages, are visualized in this
dendrogram. The later `r max(treeCut)` nodes are colored in red.
```{r}
#| column: page
#| fig-height: 30
#| fig-width: 10
#| out-width: '100%'
#| layout-valign: top
rev (repDend) %>%
dendextend:: set (
what = "by_labels_branches_col" ,
value = setdiff (representatives$ prophage_id, names (noisyNodes))
) %>%
dendextend:: set ("labels_cex" , 0.5 ) %>%
plot (
horiz = TRUE ,
main = "Cluster representatives dendrogram"
)
```
:::